package detailpage.parse;

import java.io.File;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.jsoup.nodes.Document;

import parse.news.detailpage.CleanNewsDetailPage;
import parsewebpage.ParseWebPage;

import com.liyuncong.application.commontools.DoNothing;
import com.liyuncong.application.commontools.FileOperate;
import com.liyuncong.application.commontools.FileTools;

/**
 * Batch job over saved detail pages:
 * <ol>
 * <li>collects the detail-page files found under a root directory;</li>
 * <li>parses each one with {@link CleanNewsDetailPage} and saves the
 * resulting HTML into a {@code parse} sub-directory of the directory the
 * page was found in.</li>
 * </ol>
 *
 * @author liyuncong
 */
public class Parser {
	/** Root directory scanned when no explicit root is supplied. */
	private static final String DEFAULT_ROOT_DIR = "D:\\program\\bigdata"
			+ "\\people.com.cn";
	/** Name of the sub-directory that receives the parsed pages. */
	private static final String OUTPUT_DIR_NAME = "parse";
	/**
	 * Only files whose name is strictly longer than this many characters
	 * are treated as detail pages.
	 */
	private static final int MAX_NON_DETAIL_NAME_LENGTH = 10;
	/** Charset name handed to the page parser. */
	private static final String PAGE_CHARSET = "utf-8";

	/** Root directory whose content is scanned for detail pages. */
	private final String rootDir;
	/** Maps a directory to the detail-page files found directly in it. */
	private final Map<File, List<File>> dirDetailPagePair = 
			new HashMap<>();
	
	/**
	 * Creates a parser that scans the default root directory.
	 */
	public Parser() {
		this(DEFAULT_ROOT_DIR);
	}
	
	/**
	 * Creates a parser that scans the given root directory. The scan
	 * happens immediately, during construction.
	 * 
	 * @param rootDir path of the directory to scan; must not be null
	 * @throws IllegalArgumentException if {@code rootDir} is null
	 */
	public Parser(String rootDir) {
		if (rootDir == null) {
			throw new IllegalArgumentException("rootDir must not be null");
		}
		this.rootDir = rootDir;
		collectDetailPages();
	}
	
	/**
	 * Fills {@link #dirDetailPagePair} by handing a callback to
	 * {@code FileTools.traverse}; every {@code File} the callback receives
	 * is inspected for detail pages among its direct children.
	 */
	private void collectDetailPages() {
		File root = new File(rootDir);
		FileTools.traverse(root, new FileOperate() {
			
			@Override
			public void action(File file) {
				// NOTE(review): assumes traverse() also visits nested
				// directories. Output directories of an earlier run are
				// skipped so that already-parsed pages are not parsed
				// again into parse\parse\...
				if (OUTPUT_DIR_NAME.equals(file.getName())) {
					return;
				}
				// listFiles() returns null when the path is not a
				// directory or when an I/O error occurs.
				File[] children = file.listFiles();
				if (children == null) {
					return;
				}
				List<File> detailPages = new LinkedList<>();
				for (File child : children) {
					if (child.isFile() && child.getName().length() 
							> MAX_NON_DETAIL_NAME_LENGTH) {
						detailPages.add(child);
					}
				}
				// Directories without any detail page are not recorded.
				if (!detailPages.isEmpty()) {
					dirDetailPagePair.put(file, detailPages);
				}
			}
		}, new DoNothing());
	}
	
	/**
	 * Parses every collected detail page and writes the outer HTML of the
	 * resulting document to {@code <directory>/parse/<file name>}. Pages
	 * for which the parser returns {@code null} are skipped. When the
	 * output directory cannot be created, the whole source directory is
	 * skipped and a message is printed to {@code System.err}.
	 */
	public void parse() {
		ParseWebPage parseWebPage = new CleanNewsDetailPage();
		for (Map.Entry<File, List<File>> entry 
				: dirDetailPagePair.entrySet()) {
			File sourceDir = entry.getKey();
			System.out.println(sourceDir);
			
			File saveDir = new File(sourceDir.getAbsoluteFile(), 
					OUTPUT_DIR_NAME);
			// mkdirs() reports failure through its return value; the
			// isDirectory() re-check covers a directory that appeared
			// in the meantime.
			if (!saveDir.isDirectory() && !saveDir.mkdirs() 
					&& !saveDir.isDirectory()) {
				System.err.println("Cannot create output directory: " 
						+ saveDir);
				continue;
			}
			
			for (File detailPage : entry.getValue()) {
				Object parsed = parseWebPage.parse(detailPage, 
						PAGE_CHARSET);
				if (parsed == null) {
					continue;
				}
				Document document = (Document) parsed;
				File target = new File(saveDir, detailPage.getName());
				FileTools.writeStringToFile(document.outerHtml(), 
						target.getPath());
			}
		}
	}
	
	/**
	 * Command-line entry point.
	 * 
	 * @param args optional; when present, {@code args[0]} is used as the
	 *             root directory instead of the built-in default
	 */
	public static void main(String[] args) {
		Parser parser = (args != null && args.length > 0) 
				? new Parser(args[0]) 
				: new Parser();
		parser.parse();
	}
}
